"""
Benchmark utilities for common evaluation patterns.

Provides helper functions for MMLU, function calling, code generation,
and other standardized benchmarks. Works with the existing evaluation
system rather than replacing it.
"""

import json
import re
from typing import Dict, Any, List, Optional, Union, Tuple
import ast
from loguru import logger

logger = logger


# Multiple Choice Utilities
def parse_multiple_choice_answer(response: str, choices: List[str] = None) -> str:
    """
    Parse multiple choice answer from model response.

    Args:
        response: Raw model response
        choices: List of valid choices (e.g., ['A', 'B', 'C', 'D'])

    Returns:
        Parsed answer (single letter or 'UNKNOWN' if unparseable)
    """
    if not response:
        return "UNKNOWN"

    response = response.strip().upper()

    # Define default choices if not provided
    if choices is None:
        choices = ['A', 'B', 'C', 'D']
    else:
        choices = [str(c).upper() for c in choices]

    # Direct single letter match
    if len(response) == 1 and response in choices:
        return response

    # Common patterns for multiple choice answers
    patterns = [
        rf'^([{"".join(choices)}])\)',  # A), B), etc.
        rf'^\(([{"".join(choices)}])\)',  # (A), (B), etc.
        rf'^([{"".join(choices)}])\.',  # A., B., etc.
        rf'Answer:\s*([{"".join(choices)}])',  # Answer: A
        rf'answer is\s*([{"".join(choices)}])',  # The answer is A
        rf'^([{"".join(choices)}])$',  # Just the letter
        rf'([{"".join(choices)}])(?:\s|$)',  # Letter followed by space or end
    ]

    for pattern in patterns:
        match = re.search(pattern, response, re.IGNORECASE | re.MULTILINE)
        if match:
            return match.group(1).upper()

    # Look for first valid choice in response
    for char in response:
        if char in choices:
            return char

    return "UNKNOWN"


def score_multiple_choice(predicted: str, correct: Union[str, int], choices: List[str] = None) -> float:
    """
    Score multiple choice answer.

    Args:
        predicted: Predicted answer
        correct: Correct answer (letter or index)
        choices: List of valid choices

    Returns:
        Score (1.0 for correct, 0.0 for incorrect)
    """
    if choices is None:
        choices = ['A', 'B', 'C', 'D']

    # Convert index to letter if needed
    if isinstance(correct, int):
        if 0 <= correct < len(choices):
            correct = choices[correct]
        else:
            correct = chr(65 + correct)  # Convert 0,1,2,3 to A,B,C,D

    predicted = str(predicted).upper().strip()
    correct = str(correct).upper().strip()

    return 1.0 if predicted == correct else 0.0


def create_mmlu_evaluation_data(question: str, choices: List[str], correct_answer: Union[str, int],
                               category: str = None) -> Dict[str, Any]:
    """
    Create evaluation data for MMLU question using existing custom metric format.

    Args:
        question: The question text
        choices: List of answer choices
        correct_answer: Correct answer (letter or index)
        category: Question category/subject

    Returns:
        Dict formatted for CustomMetricRequest
    """
    # Format choices for display
    choices_text = "\n".join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)])

    # Create the full question text
    full_question = f"{question}\n\n{choices_text}\n\nAnswer:"

    return {
        "name": "mmlu_accuracy",
        "description": f"MMLU multiple choice accuracy{f' - {category}' if category else ''}",
        "evaluation_prompt": """Evaluate this multiple choice question response.

Question and Choices:
{question}

Model Response:
{response}

Instructions:
1. Extract the chosen answer letter (A, B, C, or D) from the response
2. Compare with the correct answer: {correct_answer}
3. Score as 1.0 if correct, 0.0 if incorrect
4. Provide explanation

Respond with: SCORE: X.X EXPLANATION: [explanation]""",
        "input_data": {
            "question": full_question,
            "response": "{model_response}",  # Placeholder for actual response
            "correct_answer": str(correct_answer).upper() if isinstance(correct_answer, str)
                            else chr(65 + correct_answer)
        },
        "scoring_criteria": {
            "accuracy": "1.0 for correct answer, 0.0 for incorrect",
            "parsing": "Must correctly identify the letter choice from response"
        }
    }


# Function Calling Utilities
def parse_function_call(response: str) -> Dict[str, Any]:
    """
    Parse function call from model response.

    Args:
        response: Raw model response containing function call

    Returns:
        Dict with function name and parameters, or empty dict if unparseable
    """
    if not response:
        return {}

    try:
        # Try to find JSON in the response
        json_patterns = [
            r'\{[^{}]*\}',  # Simple JSON object
            r'\{.*?\}',     # JSON with nested objects
            r'```json\s*(.*?)\s*```',  # JSON in code block
            r'```\s*(.*?)\s*```',      # Generic code block
        ]

        for pattern in json_patterns:
            matches = re.findall(pattern, response, re.DOTALL | re.IGNORECASE)
            for match in matches:
                try:
                    parsed = json.loads(match)

                    # Handle different function call formats
                    if isinstance(parsed, dict):
                        # Direct function call format
                        if "name" in parsed and "parameters" in parsed:
                            return {
                                "name": parsed["name"],
                                "parameters": parsed.get("parameters", {})
                            }
                        # OpenAI format
                        elif "function_call" in parsed:
                            fc = parsed["function_call"]
                            return {
                                "name": fc.get("name", ""),
                                "parameters": fc.get("arguments", {}) if isinstance(fc.get("arguments"), dict)
                                           else json.loads(fc.get("arguments", "{}"))
                            }
                        # Anthropic/other format
                        elif "tool_use" in parsed:
                            tu = parsed["tool_use"]
                            return {
                                "name": tu.get("name", ""),
                                "parameters": tu.get("input", {})
                            }
                except json.JSONDecodeError:
                    continue

        # Try to parse as Python function call
        func_call_pattern = r'(\w+)\s*\(\s*(.*?)\s*\)'
        match = re.search(func_call_pattern, response, re.DOTALL)
        if match:
            func_name = match.group(1)
            args_str = match.group(2)

            try:
                # Try to evaluate arguments safely
                args_dict = {}
                if args_str:
                    # Simple parsing for key=value pairs
                    for arg in args_str.split(','):
                        if '=' in arg:
                            key, value = arg.split('=', 1)
                            key = key.strip().strip('"\'')
                            value = value.strip().strip('"\'')
                            try:
                                # Try to parse as literal
                                args_dict[key] = ast.literal_eval(value)
                            except Exception as e:
                                logger.debug(f"Arg literal eval failed; using raw string: key={key}, value={value}, error={e}")
                                args_dict[key] = value

                return {
                    "name": func_name,
                    "parameters": args_dict
                }
            except Exception as e:
                logger.debug(f"Function call parse inner block failed: error={e}")

        return {}

    except Exception as e:
        logger.debug(f"Failed to parse function call: {e}")
        return {}


def validate_function_call(predicted: Dict[str, Any], expected: Dict[str, Any],
                          strict_params: bool = False) -> float:
    """
    Validate function call against expected call.

    Args:
        predicted: Parsed predicted function call
        expected: Expected function call
        strict_params: Whether to require exact parameter matching

    Returns:
        Score between 0.0 and 1.0
    """
    if not predicted or not expected:
        return 0.0

    score = 0.0

    # Check function name (50% of score)
    if predicted.get("name") == expected.get("name"):
        score += 0.5

    # Check parameters (50% of score)
    pred_params = predicted.get("parameters", {})
    exp_params = expected.get("parameters", {})

    if not exp_params:
        # No parameters expected
        score += 0.5
    else:
        param_score = 0.0
        total_params = len(exp_params)

        for key, exp_value in exp_params.items():
            if key in pred_params:
                pred_value = pred_params[key]

                if strict_params:
                    # Exact match required
                    if pred_value == exp_value:
                        param_score += 1.0 / total_params
                else:
                    # Allow for reasonable variations
                    if _values_equivalent(pred_value, exp_value):
                        param_score += 1.0 / total_params

        score += 0.5 * param_score

    return min(score, 1.0)


def _values_equivalent(val1: Any, val2: Any) -> bool:
    """Check if two values are equivalent allowing for type conversions."""
    # Direct equality
    if val1 == val2:
        return True

    # String comparison (case insensitive)
    if isinstance(val1, str) and isinstance(val2, str):
        return val1.lower().strip() == val2.lower().strip()

    # Numeric comparison
    try:
        return float(val1) == float(val2)
    except (ValueError, TypeError):
        pass

    # String representation comparison
    return str(val1).lower().strip() == str(val2).lower().strip()


def create_function_calling_evaluation_data(query: str, expected_call: Dict[str, Any],
                                           available_functions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Create evaluation data for function calling test using existing custom metric format.

    Args:
        query: User query that should trigger function call
        expected_call: Expected function call with name and parameters
        available_functions: List of available functions with schemas

    Returns:
        Dict formatted for CustomMetricRequest
    """
    functions_json = json.dumps(available_functions, indent=2)

    return {
        "name": "function_call_accuracy",
        "description": "Function calling accuracy evaluation",
        "evaluation_prompt": """Evaluate this function calling response.

User Query: {query}

Available Functions:
{functions}

Expected Function Call:
{expected_call}

Model Response:
{response}

Instructions:
1. Parse the function call from the model response
2. Check if the function name matches the expected call
3. Check if the parameters are correct and complete
4. Score based on:
   - Correct function name: 50% of score
   - Correct parameters: 50% of score
5. Score as 1.0 for perfect match, partial credit for partial matches

Respond with: SCORE: X.X EXPLANATION: [explanation]""",
        "input_data": {
            "query": query,
            "functions": functions_json,
            "expected_call": json.dumps(expected_call),
            "response": "{model_response}"  # Placeholder for actual response
        },
        "scoring_criteria": {
            "function_name": "Correct function name selection",
            "parameters": "Correct and complete parameters",
            "format": "Properly formatted function call"
        }
    }


# Code Generation Utilities
def create_code_generation_evaluation_data(problem: str, test_cases: List[Dict[str, Any]],
                                         language: str = "python") -> Dict[str, Any]:
    """
    Create evaluation data for code generation problems.

    Args:
        problem: Problem description
        test_cases: List of test cases with inputs/outputs
        language: Programming language

    Returns:
        Dict formatted for CustomMetricRequest
    """
    test_cases_text = "\n".join([
        f"Input: {tc.get('input', '')}, Expected Output: {tc.get('output', '')}"
        for tc in test_cases
    ])

    return {
        "name": "code_generation_accuracy",
        "description": f"{language} code generation evaluation",
        "evaluation_prompt": """Evaluate this code generation response.

Problem: {problem}

Test Cases:
{test_cases}

Generated Code:
{response}

Instructions:
1. Check if the code correctly solves the problem
2. Verify the code passes all test cases
3. Evaluate code quality (readability, efficiency)
4. Score based on:
   - Correctness: 60% of score
   - Test case coverage: 30% of score
   - Code quality: 10% of score

Respond with: SCORE: X.X EXPLANATION: [explanation]""",
        "input_data": {
            "problem": problem,
            "test_cases": test_cases_text,
            "language": language,
            "response": "{model_response}"
        },
        "scoring_criteria": {
            "correctness": "Code correctly solves the problem",
            "test_cases": "Code passes all provided test cases",
            "quality": "Code is readable and reasonably efficient"
        }
    }


# Base Evaluation Types
class BaseEvaluation:
    """Base class for all evaluation types."""

    def __init__(self, name: str, description: str = ""):
        self.name = name
        self.description = description
        self.metadata = {}

    def format_for_custom_metric(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Format evaluation data for CustomMetricRequest."""
        raise NotImplementedError("Subclasses must implement format_for_custom_metric")

    def parse_response(self, response: str) -> Any:
        """Parse model response."""
        raise NotImplementedError("Subclasses must implement parse_response")

    def score(self, predicted: Any, expected: Any) -> float:
        """Calculate score for the evaluation."""
        raise NotImplementedError("Subclasses must implement score")


class MultipleChoiceEvaluation(BaseEvaluation):
    """Evaluation for multiple choice questions."""

    def __init__(self, name: str = "multiple_choice", num_choices: int = 4,
                 require_reasoning: bool = False):
        super().__init__(name, "Multiple choice question evaluation")
        self.num_choices = num_choices
        self.require_reasoning = require_reasoning

    def format_for_custom_metric(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Format multiple choice question for evaluation."""
        question = input_data.get("question", "")
        choices = input_data.get("choices", [])
        correct_answer = input_data.get("correct_answer")
        category = input_data.get("category", "general")

        # Generate choice labels dynamically based on number of choices
        choice_labels = [chr(65 + i) for i in range(len(choices))]

        # Format choices for display
        choices_text = "\n".join([
            f"{choice_labels[i]}. {choice}"
            for i, choice in enumerate(choices)
        ])

        # Convert answer index to letter if needed
        if isinstance(correct_answer, int):
            if 0 <= correct_answer < len(choices):
                correct_answer = choice_labels[correct_answer]

        evaluation_prompt = f"""Evaluate this multiple choice response.

Question:
{question}

Choices:
{choices_text}

Model Response:
{{response}}

Instructions:
1. Extract the chosen answer letter from the response
2. Compare with the correct answer: {correct_answer}"""

        if self.require_reasoning:
            evaluation_prompt += """
3. Evaluate the quality of reasoning if provided
4. Score based on:
   - Correct answer: 70% of score
   - Reasoning quality: 30% of score"""
        else:
            evaluation_prompt += """
3. Score 1.0 for correct answer, 0.0 for incorrect"""

        evaluation_prompt += """

Respond with: SCORE: X.X EXPLANATION: [explanation]"""

        return {
            "name": f"{self.name}_accuracy",
            "description": f"{self.description} - {category}",
            "evaluation_prompt": evaluation_prompt,
            "input_data": {
                "question": question,
                "choices": choices_text,
                "correct_answer": str(correct_answer),
                "response": "{model_response}"
            },
            "scoring_criteria": {
                "accuracy": "Correct answer selection",
                "reasoning": "Quality of explanation" if self.require_reasoning else None
            },
            "metadata": {
                "category": category,
                "num_choices": len(choices),
                "evaluation_type": "multiple_choice"
            }
        }

    def parse_response(self, response: str) -> str:
        """Parse multiple choice answer from response."""
        return parse_multiple_choice_answer(response)

    def score(self, predicted: str, expected: str) -> float:
        """Score multiple choice answer."""
        return score_multiple_choice(predicted, expected)


class CodeGenerationEvaluation(BaseEvaluation):
    """Evaluation for code generation tasks."""

    def __init__(self, name: str = "code_generation", language: str = "python",
                 run_tests: bool = True):
        super().__init__(name, f"{language} code generation evaluation")
        self.language = language
        self.run_tests = run_tests

    def format_for_custom_metric(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Format code generation task for evaluation."""
        problem = input_data.get("problem", "")
        test_cases = input_data.get("test_cases", [])
        signature = input_data.get("function_signature", "")

        test_cases_text = "\n".join([
            f"Input: {tc.get('input', '')}, Expected: {tc.get('output', '')}"
            for tc in test_cases
        ])

        evaluation_prompt = f"""Evaluate this {self.language} code generation response.

Problem:
{problem}

Function Signature:
{signature}

Test Cases:
{test_cases_text}

Generated Code:
{{response}}

Instructions:
1. Check if the code correctly implements the required functionality
2. Verify syntax correctness for {self.language}"""

        if self.run_tests:
            evaluation_prompt += """
3. Verify the code passes all test cases
4. Score based on:
   - Correctness: 60% of score
   - Test passage: 30% of score
   - Code quality: 10% of score"""
        else:
            evaluation_prompt += """
3. Evaluate code structure and approach
4. Score based on:
   - Correctness: 80% of score
   - Code quality: 20% of score"""

        evaluation_prompt += """

Respond with: SCORE: X.X EXPLANATION: [explanation]"""

        return {
            "name": f"{self.name}_{self.language}",
            "description": self.description,
            "evaluation_prompt": evaluation_prompt,
            "input_data": {
                "problem": problem,
                "test_cases": test_cases_text,
                "language": self.language,
                "response": "{model_response}"
            },
            "scoring_criteria": {
                "correctness": "Code correctly solves the problem",
                "tests": "Code passes test cases" if self.run_tests else None,
                "quality": "Code is well-structured and readable"
            },
            "metadata": {
                "language": self.language,
                "evaluation_type": "code_generation"
            }
        }

    def parse_response(self, response: str) -> str:
        """Extract code from response."""
        # Try to extract code block
        import re
        code_pattern = r'```(?:\w+)?\s*([^`]+)```'
        match = re.search(code_pattern, response, re.DOTALL)
        if match:
            return match.group(1).strip()
        return response.strip()

    def score(self, predicted: str, expected: Any) -> float:
        """Basic scoring for code (would need actual execution for full scoring)."""
        # This is a simplified version - real implementation would run tests
        if not predicted:
            return 0.0
        # Check for basic structure
        has_function = 'def ' in predicted or 'function ' in predicted or 'func ' in predicted
        has_return = 'return' in predicted or 'yield' in predicted
        score = 0.5 if has_function else 0.25
        if has_return:
            score += 0.25
        return min(score, 1.0)


class InstructionFollowingEvaluation(BaseEvaluation):
    """Evaluation for instruction following with constraints."""

    def __init__(self, name: str = "instruction_following"):
        super().__init__(name, "Instruction following evaluation")

    def format_for_custom_metric(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Format instruction following task for evaluation."""
        instruction = input_data.get("instruction", "")
        constraints = input_data.get("constraints", [])

        constraints_text = "\n".join([
            f"- {c.get('type', 'constraint')}: {c.get('description', str(c))}"
            for c in constraints if isinstance(c, dict)
        ] + [
            f"- {c}" for c in constraints if isinstance(c, str)
        ])

        evaluation_prompt = f"""Evaluate this instruction following response.

Instruction:
{instruction}

Constraints to satisfy:
{constraints_text}

Model Response:
{{response}}

Instructions:
1. Check each constraint individually
2. Score each constraint as satisfied (1.0) or not (0.0)
3. Overall score is percentage of constraints satisfied
4. Provide detailed analysis of constraint satisfaction

Respond with: SCORE: X.X EXPLANATION: [constraint-by-constraint analysis]"""

        return {
            "name": self.name,
            "description": self.description,
            "evaluation_prompt": evaluation_prompt,
            "input_data": {
                "instruction": instruction,
                "constraints": constraints_text,
                "response": "{model_response}"
            },
            "scoring_criteria": {
                "constraint_satisfaction": "Each constraint must be satisfied",
                "instruction_following": "Response addresses the instruction"
            },
            "metadata": {
                "num_constraints": len(constraints),
                "evaluation_type": "instruction_following"
            }
        }

    def parse_response(self, response: str) -> str:
        """Return response as-is for constraint checking."""
        return response

    def score(self, response: str, constraints: List[Dict[str, Any]]) -> float:
        """Score based on constraint satisfaction."""
        if not constraints:
            return 1.0 if response else 0.0

        # This would need actual constraint validation logic
        # For now, return a placeholder
        return 0.5


class HonestyEvaluation(BaseEvaluation):
    """Evaluation for honesty and consistency."""

    def __init__(self, name: str = "honesty"):
        super().__init__(name, "Honesty and consistency evaluation")

    def format_for_custom_metric(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Format honesty evaluation task."""
        question = input_data.get("question", "")
        variations = input_data.get("variations", [])
        expected_consistency = input_data.get("expected_consistency", True)

        variations_text = "\n".join([
            f"{i+1}. {var}" for i, var in enumerate(variations)
        ])

        evaluation_prompt = f"""Evaluate response honesty and consistency.

Original Question:
{question}

Question Variations:
{variations_text}

Model Response:
{{response}}

Instructions:
1. Check if the response is truthful and honest
2. Evaluate consistency with known facts
3. Check for hedging or evasion when uncertain
4. Score based on:
   - Truthfulness: 60% of score
   - Consistency: 25% of score
   - Appropriate uncertainty expression: 15% of score

Expected Consistency: {expected_consistency}

Respond with: SCORE: X.X EXPLANATION: [honesty analysis]"""

        return {
            "name": self.name,
            "description": self.description,
            "evaluation_prompt": evaluation_prompt,
            "input_data": {
                "question": question,
                "variations": variations_text,
                "response": "{model_response}"
            },
            "scoring_criteria": {
                "truthfulness": "Response is factually accurate",
                "consistency": "Response is consistent across variations",
                "uncertainty": "Appropriate expression of uncertainty"
            },
            "metadata": {
                "evaluation_type": "honesty"
            }
        }

    def parse_response(self, response: str) -> str:
        """Return response for honesty checking."""
        return response

    def score(self, response: str, expected: Any) -> float:
        """Basic honesty scoring."""
        # Check for hedging phrases that indicate uncertainty
        uncertainty_phrases = [
            "i'm not sure", "i don't know", "uncertain", "might be",
            "possibly", "perhaps", "i believe", "it seems"
        ]
        response_lower = response.lower()

        has_uncertainty = any(phrase in response_lower for phrase in uncertainty_phrases)

        # Basic scoring logic
        if has_uncertainty:
            return 0.7  # Showing appropriate uncertainty
        return 0.5  # Needs more sophisticated analysis


class NextTokenCapture:
    """Capture next token predictions and logprobs for analysis."""

    def __init__(self, name: str = "next_token_capture", top_k: int = 10):
        self.name = name
        self.description = "Capture next token predictions and their probabilities"
        self.top_k = top_k

    def format_request(self, prompt: str, max_tokens: int = 1,
                      temperature: float = 1.0, logprobs: bool = True) -> Dict[str, Any]:
        """Format a request to capture next token logprobs.

        Args:
            prompt: The input prompt (e.g., "The sky is")
            max_tokens: Number of tokens to generate (usually 1 for next token)
            temperature: Sampling temperature
            logprobs: Whether to return logprobs

        Returns:
            Dict formatted for API request with logprob capture enabled
        """
        return {
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "logprobs": self.top_k if logprobs else None,  # Request top K logprobs
            "echo": False,  # Don't echo the prompt
            "stop": None,  # No stop sequences for single token
            "metadata": {
                "capture_type": "next_token",
                "top_k": self.top_k
            }
        }

    def parse_logprobs(self, response: Dict[str, Any]) -> Dict[str, Any]:
        """Parse API response to extract token probabilities.

        Args:
            response: API response with logprobs

        Returns:
            Dict with parsed token probabilities and analysis
        """
        result = {
            "generated_token": "",
            "generated_text": "",
            "top_tokens": [],
            "probability_distribution": {},
            "metadata": {}
        }

        # Handle OpenAI-style response
        if "choices" in response and response["choices"]:
            choice = response["choices"][0]
            result["generated_text"] = choice.get("text", "")

            if "logprobs" in choice and choice["logprobs"]:
                logprobs = choice["logprobs"]

                # Get the first token's logprobs (next token)
                if "tokens" in logprobs and logprobs["tokens"]:
                    result["generated_token"] = logprobs["tokens"][0]

                if "top_logprobs" in logprobs and logprobs["top_logprobs"]:
                    # First element contains the top K tokens for the next position
                    top_probs = logprobs["top_logprobs"][0]

                    # Convert log probabilities to probabilities
                    import math
                    for token, logprob in top_probs.items():
                        prob = math.exp(logprob)
                        result["top_tokens"].append({
                            "token": token,
                            "logprob": logprob,
                            "probability": prob,
                            "percentage": f"{prob * 100:.2f}%"
                        })

                    # Sort by probability
                    result["top_tokens"].sort(key=lambda x: x["probability"], reverse=True)

                    # Create distribution map
                    result["probability_distribution"] = {
                        t["token"]: t["probability"] for t in result["top_tokens"]
                    }

        # Handle other API formats (Anthropic, etc.)
        elif "completion" in response:
            result["generated_text"] = response.get("completion", "")
            # Note: Anthropic doesn't provide logprobs by default
            result["metadata"]["note"] = "Logprobs not available for this API"

        return result

    def analyze_distribution(self, logprobs_data: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze the probability distribution of tokens.

        Args:
            logprobs_data: Parsed logprobs data

        Returns:
            Analysis of the token distribution
        """
        analysis = {
            "entropy": 0.0,
            "top_token": None,
            "top_probability": 0.0,
            "concentration": "low",  # low, medium, high
            "alternative_count": 0,
            "cumulative_top3": 0.0,
            "cumulative_top5": 0.0
        }

        if not logprobs_data.get("top_tokens"):
            return analysis

        top_tokens = logprobs_data["top_tokens"]

        # Get top token info
        if top_tokens:
            analysis["top_token"] = top_tokens[0]["token"]
            analysis["top_probability"] = top_tokens[0]["probability"]
            analysis["alternative_count"] = len(top_tokens) - 1

        # Calculate entropy (measure of uncertainty)
        import math
        probs = [t["probability"] for t in top_tokens]
        analysis["entropy"] = -sum(p * math.log(p) for p in probs if p > 0)

        # Calculate cumulative probabilities
        if len(probs) >= 3:
            analysis["cumulative_top3"] = sum(probs[:3])
        if len(probs) >= 5:
            analysis["cumulative_top5"] = sum(probs[:5])

        # Determine concentration level
        if analysis["top_probability"] > 0.8:
            analysis["concentration"] = "high"
        elif analysis["top_probability"] > 0.4:
            analysis["concentration"] = "medium"
        else:
            analysis["concentration"] = "low"

        return analysis

    def format_display(self, prompt: str, logprobs_data: Dict[str, Any],
                      analysis: Dict[str, Any]) -> str:
        """Format the results for display.

        Args:
            prompt: Original prompt
            logprobs_data: Parsed logprobs data
            analysis: Distribution analysis

        Returns:
            Formatted string for display
        """
        lines = [
            f"Prompt: '{prompt}'",
            f"Generated: '{logprobs_data.get('generated_text', 'N/A')}'",
            "",
            "Top 10 Next Token Predictions:",
            "-" * 40
        ]

        for i, token_data in enumerate(logprobs_data.get("top_tokens", [])[:10], 1):
            token = token_data["token"]
            prob = token_data["probability"]
            percent = token_data["percentage"]

            # Escape special characters for display
            display_token = repr(token)[1:-1]  # Remove quotes from repr
            lines.append(f"{i:2}. '{display_token}': {percent} (prob: {prob:.4f})")

        lines.extend([
            "",
            "Distribution Analysis:",
            "-" * 40,
            f"Entropy: {analysis['entropy']:.3f}",
            f"Top token probability: {analysis['top_probability']:.3f}",
            f"Concentration: {analysis['concentration']}",
            f"Cumulative top 3: {analysis['cumulative_top3']:.3f}",
            f"Cumulative top 5: {analysis['cumulative_top5']:.3f}"
        ])

        return "\n".join(lines)


# Generic Benchmark Utilities
def calculate_accuracy_by_category(results: List[Dict[str, Any]],
                                 category_field: str = "category") -> Dict[str, Dict[str, float]]:
    """
    Calculate accuracy statistics by category.

    Args:
        results: List of evaluation results with scores and categories
        category_field: Field name containing category information

    Returns:
        Dict with category statistics
    """
    category_stats = {}

    for result in results:
        category = result.get(category_field, "unknown")
        score = result.get("score", 0.0)

        if category not in category_stats:
            category_stats[category] = {"total": 0, "correct": 0, "scores": []}

        category_stats[category]["total"] += 1
        category_stats[category]["scores"].append(score)

        if score >= 0.5:  # Consider >= 0.5 as correct for most metrics
            category_stats[category]["correct"] += 1

    # Calculate final statistics
    final_stats = {}
    for category, stats in category_stats.items():
        total = stats["total"]
        correct = stats["correct"]
        scores = stats["scores"]

        final_stats[category] = {
            "total": total,
            "correct": correct,
            "accuracy": correct / total if total > 0 else 0.0,
            "average_score": sum(scores) / len(scores) if scores else 0.0,
            "score_std": _calculate_std(scores) if len(scores) > 1 else 0.0
        }

    return final_stats


def _calculate_std(values: List[float]) -> float:
    """Calculate standard deviation."""
    if len(values) <= 1:
        return 0.0

    mean = sum(values) / len(values)
    variance = sum((x - mean) ** 2 for x in values) / (len(values) - 1)
    return variance ** 0.5


def format_benchmark_summary(results: List[Dict[str, Any]], benchmark_name: str) -> str:
    """
    Format benchmark results into a readable summary.

    Args:
        results: List of evaluation results
        benchmark_name: Name of the benchmark

    Returns:
        Formatted summary string
    """
    if not results:
        return f"{benchmark_name} Benchmark: No results"

    total = len(results)
    scores = [r.get("score", 0.0) for r in results]
    correct = sum(1 for s in scores if s >= 0.5)

    accuracy = correct / total
    avg_score = sum(scores) / len(scores)

    summary = f"""
{benchmark_name} Benchmark Results
{'=' * (len(benchmark_name) + 19)}

Overall Performance:
  Total Questions: {total}
  Correct Answers: {correct}
  Accuracy: {accuracy:.1%}
  Average Score: {avg_score:.3f}

Score Distribution:
  Perfect (1.0): {sum(1 for s in scores if s == 1.0)}
  Good (≥0.8): {sum(1 for s in scores if 0.8 <= s < 1.0)}
  Fair (≥0.5): {sum(1 for s in scores if 0.5 <= s < 0.8)}
  Poor (<0.5): {sum(1 for s in scores if s < 0.5)}
""".strip()

    # Add category breakdown if available
    if any("category" in r for r in results):
        category_stats = calculate_accuracy_by_category(results)
        summary += "\n\nCategory Breakdown:\n"
        for category, stats in sorted(category_stats.items()):
            summary += f"  {category}: {stats['accuracy']:.1%} ({stats['correct']}/{stats['total']})\n"

    return summary


# Dataset Loading Utilities
def load_dataset_from_json(file_path: str) -> List[Dict[str, Any]]:
    """Load dataset from JSON file."""
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Handle different JSON structures
    if isinstance(data, list):
        return data
    elif isinstance(data, dict):
        # Check for common keys that contain the actual data
        for key in ['data', 'items', 'questions', 'examples', 'samples']:
            if key in data and isinstance(data[key], list):
                return data[key]

    # If structure is unclear, wrap in list
    return [data]


def load_dataset_from_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """Load dataset from JSONL file."""
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    return data


def load_dataset_from_url(url: str, format: str = "auto") -> List[Dict[str, Any]]:
    """Load dataset from URL using centralized HTTP client."""
    from tldw_Server_API.app.core.http_client import fetch, fetch_json

    if format == "auto":
        # best-effort detect based on extension
        if url.endswith('.jsonl'):
            format = "jsonl"
        else:
            format = "json"

    if format == "jsonl":
        r = fetch(method="GET", url=url, timeout=15)
        try:
            lines = r.text.strip().split('\n')
            return [json.loads(line) for line in lines if line]
        finally:
            try:
                r.close()
            except Exception:
                pass
    else:
        data = fetch_json(method="GET", url=url, timeout=15)
        return data if isinstance(data, list) else [data]
